#################
# Gu and Zhou (2015), "Education as Institutionalized Diffusion Path"
# Gov 2001 Course Replication Paper, April 2015
# All rights reserved.
# data source: http://sda.berkeley.edu/sdaweb/analysis/?dataset=gss12

#install.packages("lme4")
#install.packages("VGAM")
#install.packages("ZeligChoice", repos="http://r.iq.harvard.edu/", type="source")
library(Zelig)
library(ZeligChoice)
library(lme4)
require(car)

rm(list=ls())
setwd ("/Users/yanhuazhou/Dropbox/Coursework/Gov2001_stats/GOV 2001_replication/Environmental spending/GSS data")
data=read.csv("data.csv", header=TRUE)

setwd ("/Users/ygu/Dropbox/GOV 2001_replication/Environmental spending/GSS data")
data=read.csv("data.csv", header=TRUE)

#### SAMPLE
# year: 1972-2008    delete obs of year 2010-2012
# age: >=25,  delete obs younger than 25
table(data$YEAR)
table (data$AGE) # 89  89 OR OLDER;  98  DK ; 99  NA
data0<-subset(data, data$YEAR<"2010"& data$AGE>="25")
data2<-subset(data0, data0$AGE!=98 & data0$AGE!=99)
sum(is.na(data2$YEAR))

data2$AgeGroup<- recode(data2$AGE,
                        "25:29=1;
                        30:34=2;
                        35:39=3;
                        40:44=4;
                        45:49=5;
                        50:54=6;
                        55:59=7;
                        60:64=8;
                        65:69=9;
                        70:74=10;
                        75:79=11;
                        80:89=12")
table(data2$AgeGroup)
summary(data2$YEAR)

row_age<-cbind(sum((1-is.na(data2$AGE)),na.rm=TRUE),
               mean(data2$AGE,na.rm=TRUE),
               sd(data2$AGE,na.rm=TRUE),
               min(data2$AGE,na.rm=TRUE),
               max(data2$AGE,na.rm=TRUE))
row_age<-round(row_age,digits = 1)

#### calculate DV: ESS= ES/SPENDING(MEAN)
## ALL ITEMS FOR NATIONAL SPENDING
# 400 NATSPAC (Space exploration program) 
# 401 NATENVIR (Improving and protecting the environment)
# 402 NATHEAL (IMPROVING & PROTECTING NATIONS HEALTH)
# 403 NATCITY (Solving the problems of the big cities)
# 404 NATCRIME (Halting the rising crime rate.)
# 405 NATDRUG (Dealing with drug addiction.)
# 406 NATEDUC (Improving the nation's education system.)
# 407 NATRACE (Improving the conditions of Blacks.)
# 408 NATARMS (The military,armaments and defense.)
# 409 NATAID (Foreign aid)
# 410 NATFARE (Welfare)
# 411 NATROAD (Highways and bridges)
# 412 NATSOC (Social Security)
# 413 NATMASS (Mass Transportation)
# 414 NATPARK (Parks and recreation)
# 415 NATCHLD (Assistance for childcare.)
# 416 NATSCI (Supporting scientific research)
# 417 NATENRGY (Developing alternative energy sources)

table(data2$NATENVIR)  # 0, IAP(inapplicable); 8, DK(don't know); 9, NA(not answer)
# 1  TOO LITTLE    2  ABOUT RIGHT    3  TOO MUCH

sum(is.na(data2$NATENVIR))

data2$NATENVIR[data2$NATENVIR==0|data2$NATENVIR==8|data2$NATENVIR==9] <- NA
data2$NATSPAC[data2$NATSPAC==0|data2$NATSPAC==8|data2$NATSPAC==9] <- NA
data2$NATHEAL[data2$NATHEAL==0|data2$NATHEAL==8|data2$NATHEAL==9] <- NA
data2$NATCITY[data2$NATCITY==0|data2$NATCITY==8|data2$NATCITY==9] <- NA
data2$NATCRIME[data2$NATCRIME==0|data2$NATCRIME==8|data2$NATCRIME==9] <- NA
data2$NATDRUG[data2$NATDRUG==0|data2$NATDRUG==8|data2$NATDRUG==9] <- NA
data2$NATEDUC[data2$NATEDUC==0|data2$NATEDUC==8|data2$NATEDUC==9] <- NA
data2$NATRACE[data2$NATRACE==0|data2$NATRACE==8|data2$NATRACE==9] <- NA
data2$NATARMS[data2$NATARMS==0|data2$NATARMS==8|data2$NATARMS==9] <- NA
data2$NATAID[data2$NATAID==0|data2$NATAID==8|data2$NATAID==9] <- NA
data2$NATFARE[data2$NATFARE==0|data2$NATFARE==8|data2$NATFARE==9] <- NA
data2$NATROAD[data2$NATROAD==0|data2$NATROAD==8|data2$NATROAD==9] <- NA
data2$NATSOC[data2$NATSOC==0|data2$NATSOC==8|data2$NATSOC==9] <- NA
data2$NATMASS[data2$NATMASS==0|data2$NATMASS==8|data2$NATMASS==9] <- NA
data2$NATPARK[data2$NATPARK==0|data2$NATPARK==8|data2$NATPARK==9] <- NA
data2$NATCHLD[data2$NATCHLD==0|data2$NATCHLD==8|data2$NATCHLD==9] <- NA
data2$NATSCI[data2$NATSCI==0|data2$NATSCI==8|data2$NATSCI==9] <- NA
data2$NATENRGY[data2$NATENRGY==0|data2$NATENRGY==8|data2$NATENRGY==9] <- NA


data2$all.spending<- rowMeans(data2[,400:417], na.rm=TRUE)
data2$all.spending<-4-data2$all.spending
length(data2$all.spending)
summary(data2$all.spending)

data2$absolute.support<- data2$NATENVIR
data2$absolute.support <- 4-data2$absolute.support
summary(data2$absolute.support)

absolutesupport<-as.matrix(data2$absolute.support)
row_absolutesupport<-cbind(sum((1-is.na(absolutesupport)),na.rm=TRUE),
                           mean(absolutesupport,na.rm=TRUE),
                           sd(absolutesupport,na.rm=TRUE),
                           min(absolutesupport,na.rm=TRUE),
                           max(absolutesupport,na.rm=TRUE))
row_absolutesupport<-round(row_absolutesupport,digits = 2)

data2$relative.support<- data2$absolute.support/data2$all.spending
summary(data2$relative.support, na.rm=T)

relativesupport<-as.matrix(data2$relative.support)
sum((1-is.na(relativesupport)))
mean(relativesupport,na.rm=TRUE)
sd(relativesupport,na.rm=TRUE)
min(relativesupport,na.rm=TRUE)
max(relativesupport,na.rm=TRUE)
row_relativesupport<-cbind(sum((1-is.na(relativesupport)),na.rm=TRUE),
                           mean(relativesupport,na.rm=TRUE),
                           sd(relativesupport,na.rm=TRUE),
                           min(relativesupport,na.rm=TRUE),
                           max(relativesupport,na.rm=TRUE))
row_relativesupport<-round(row_relativesupport,digits = 2)


# cohort
table(data2$COHORT)
table(data$COHORT)
data2$COHORT[data2$COHORT==9999] <- NA
sum(is.na(data2$COHORT))
data2$COHORT[data2$COHORT<1900] <- 1900
data2$COHORT[data2$COHORT>1982] <- 1982

data2$cohort <- (data2$COHORT-1900)/10
data2$cohort_sqr <- data2$cohort^2

summary(data2$cohort_sqr)


row_cohort <- cbind(sum((1-is.na(data2$COHORT)),na.rm=TRUE),
                    mean(data2$COHORT,na.rm=TRUE),
                    sd(data2$COHORT,na.rm=TRUE),
                    min(data2$COHORT,na.rm=TRUE),
                    max(data2$COHORT,na.rm=TRUE))

row_cohort<-round(row_cohort,digits = 0)

# cohort.group        # 1900-1929, 1930-1949, 1950-1969, 1970-1988
#install.packages("car")
require(car)
data2$cohort.group<- recode(data2$COHORT,
                            "1900:1929=1;
                            1930:1949=2;
                            1950:1969=3;
                            1970:1988=4")
table(data2$cohort.group)

byc_age<-aggregate(data2$AGE, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_age<-as.matrix(byc_age[,2])
byc_age<-round(byc_age,digits = 2)


row_year<-cbind(sum((1-is.na(data2$YEAR)),na.rm=TRUE),
                mean(data2$YEAR,na.rm=TRUE),
                sd(data2$YEAR,na.rm=TRUE),
                min(data2$YEAR,na.rm=TRUE),
                max(data2$YEAR,na.rm=TRUE))
row_year<-round(row_year,digits = 0)

byc_year<-aggregate(data2$YEAR, by=list(Cohort_group=data2$cohort.group), mean, na.rm=TRUE)
byc_year<-as.matrix(byc_year[,2])
byc_year<-round(byc_year,digits = 0)

allspending<-as.matrix(data2$all.spending)
row_allspending<-cbind(sum((1-is.na(allspending)),na.rm=TRUE),
                       mean(allspending,na.rm=TRUE),
                       sd(allspending,na.rm=TRUE),
                       min(allspending,na.rm=TRUE),
                       max(allspending,na.rm=TRUE))
nrow(na.omit(data2$all.spending))

row_allspending<-round(row_allspending,digits = 2)

byc_allspending<-aggregate(data2$all.spending, by=list(Cohort_group=data2$cohort.group), mean, na.rm=TRUE)
byc_allspending<-as.matrix(byc_allspending[,2])
byc_allspending<-round(byc_allspending,digits = 2)

byc_absolutesupport<-aggregate(data2$absolute.support, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_absolutesupport<-as.matrix(byc_absolutesupport[,2])
byc_absolutesupport<-round(byc_absolutesupport,digits = 2)

byc_relativesupport<-aggregate(data2$relative.support, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_relativesupport<-as.matrix(byc_relativesupport[,2])
byc_relativesupport<-round(byc_relativesupport,digits = 2)

byc_cohort<-aggregate(data2$COHORT, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_cohort<-as.matrix(byc_cohort[,2])
byc_cohort<-round(byc_cohort,digits = 0)

# sex
summary(data2$SEX) # 1  MALE  2  FEMALE -> 0
data2$male<- data2$SEX  
data2$male[data2$male==2]<-0
table(data2$male)

row_male <- cbind(sum((1-is.na(data2$male)),na.rm=TRUE),
                  mean(data2$male,na.rm=TRUE),
                  sd(data2$male,na.rm=TRUE),
                  min(data2$male,na.rm=TRUE),
                  max(data2$male,na.rm=TRUE))
row_male<-round(row_male,digits = 2)

byc_male<-aggregate(data2$male, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_male<-as.matrix(byc_male[,2])
byc_male<-round(byc_male,digits = 2)


# race
summary(data2$RACE) # 0  IAP  1  WHITE   2  BLACK  3  OTHER
data2$RACE[data2$RACE==0] <- NA
table(data2$RACE)
data2$race.black<- data2$RACE
data2$race.black[data2$race.black==1|data2$race.black==3]<- 0
data2$race.black[data2$race.black==2]<- 1
summary(data2$race.black)
table(data2$race.black)

row_raceb <- cbind(sum((1-is.na(data2$race.black)),na.rm=TRUE),
                   mean(data2$race.black,na.rm=TRUE),
                   sd(data2$race.black,na.rm=TRUE),
                   min(data2$race.black,na.rm=TRUE),
                   max(data2$race.black,na.rm=TRUE))
row_raceb<-round(row_raceb,digits = 2)

byc_raceb<-aggregate(data2$race.black, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_raceb<-as.matrix(byc_raceb[,2])
byc_raceb<-round(byc_raceb,digits = 2)

data2$race.other<-data2$RACE
data2$race.other[data2$race.other==1|data2$race.other==2]<-0
data2$race.other[data2$race.other==3]<-1
table(data2$race.other)
summary(data2$race.other)

row_raceo <- cbind(sum((1-is.na(data2$race.other)),na.rm=TRUE),
                   mean(data2$race.other,na.rm=TRUE),
                   sd(data2$race.other,na.rm=TRUE),
                   min(data2$race.other,na.rm=TRUE),
                   max(data2$race.other,na.rm=TRUE))
row_raceo<-round(row_raceo,digits = 2)

byc_raceo<-aggregate(data2$race.other, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_raceo<-as.matrix(byc_raceo[,2])
byc_raceo<-round(byc_raceo,digits = 2)

# education
summary(data2$EDUC) # 97  IAP    98  DK     99  NA
table(data2$EDUC)
data2$EDUC[data2$EDUC==98|data2$EDUC==99] <- NA
data2$education<-data2$EDUC/10
summary(data2$education)

row_edu <- cbind(sum((1-is.na(data2$education)),na.rm=TRUE),
                 mean(data2$education,na.rm=TRUE),
                 sd(data2$education,na.rm=TRUE),
                 min(data2$education,na.rm=TRUE),
                 max(data2$education,na.rm=TRUE))
row_edu<-round(row_edu,digits = 2)

byc_edu<-aggregate(data2$education, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_edu<-as.matrix(byc_edu[,2])
byc_edu<-round(byc_edu,digits = 2)

table(data2$cohort)
summary(data2$cohort)
aggregate(data2$education, by=list(Cohort_aggr=data2$cohort), mean,na.rm=TRUE)

#Calculate edu.center: demean  group-mean  

require(plyr)
aggr.educ<-aggregate(data2$education, by=list(data2$cohort),mean,na.rm=TRUE)
aggr.educ<-rename(aggr.educ, c(Group.1="cohort"))
aggr.educ<-rename(aggr.educ, c(x="cohort.educ"))

data2<-merge(data2, aggr.educ, by="cohort")
data2$educ.center<-data2$education-data2$cohort.educ
length(data2$educ.center)   # Question: why did we lose one obs: 47259--> 47260

# marriage
table(data2$MARITAL) # 1  MARRIED  2  WIDOWED 3  DIVORCED
# 4  SEPARATED 5  NEVER MARRIED 9  NA
data2$MARITAL[data2$MARITAL==9] <- NA
data2$MARITAL[data2$MARITAL!=1] <- 0
table(data2$MARITAL)

row_married <- cbind(sum((1-is.na(data2$MARITAL)),na.rm=TRUE),
                     mean(data2$MARITAL,na.rm=TRUE),
                     sd(data2$MARITAL,na.rm=TRUE),
                     min(data2$MARITAL,na.rm=TRUE),
                     max(data2$MARITAL,na.rm=TRUE))
row_married<-round(row_married,digits = 2)

byc_married<-aggregate(data2$MARITAL, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_married<-as.matrix(byc_married[,2])
byc_married<-round(byc_married,digits = 2)


# children
table(data2$CHILDS)    # 8  EIGHT OR MORE   9  DK NA
data2$CHILDS[data2$CHILDS==9] <- NA
summary(data2$CHILDS)

row_child <- cbind(sum((1-is.na(data2$CHILDS)),na.rm=TRUE),
                   mean(data2$CHILDS,na.rm=TRUE),
                   sd(data2$CHILDS,na.rm=TRUE),
                   min(data2$CHILDS,na.rm=TRUE),
                   max(data2$CHILDS,na.rm=TRUE))
row_child<-round(row_child,digits = 2)

byc_child<-aggregate(data2$CHILDS, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_child<-as.matrix(byc_child[,2])
byc_child<-round(byc_child,digits = 2)


# working status
table(data2$WRKSTAT)
# 0  IAP ; 1  WORKING FULLTIME; 2  WORKING PARTTIME; 3  TEMP NOT WORKING; 
# 4  UNEMPL, LAID OFF; 5  RETIRED; 6  SCHOOL; 7  KEEPING HOUSE;  8  OTHER   9  NA
data2$WRKSTAT[data2$WRKSTAT==9] <- NA
data2$WRKSTAT[data2$WRKSTAT==1 | data2$WRKSTAT==2 | data2$WRKSTAT==6] <- 1
data2$WRKSTAT[data2$WRKSTAT!=1] <- 0
summary(data2$WRKSTAT)

row_empl <- cbind(sum((1-is.na(data2$WRKSTAT)),na.rm=TRUE),
                  mean(data2$WRKSTAT,na.rm=TRUE),
                  sd(data2$WRKSTAT,na.rm=TRUE),
                  min(data2$WRKSTAT,na.rm=TRUE),
                  max(data2$WRKSTAT,na.rm=TRUE))
row_empl <- round(row_empl,digits = 2)

byc_empl<-aggregate(data2$WRKSTAT, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_empl<-as.matrix(byc_empl[,2])
byc_empl<-round(byc_empl,digits = 2)


# region
table(data2$REGION) # 0  NOT ASSIGNED   1  NEW ENGLAND   2  MIDDLE ATLANTIC
# 3  E. NOR. CENTRAL  4  W. NOR. CENTRAL 5  SOUTH ATLANTIC
# 6  E. SOU. CENTRAL 7  W. SOU. CENTRAL 8  MOUNTAIN 9  PACIFIC
data2$REGION[data2$REGION==0] <- NA    
summary(data2$REGION)

# Categorical var, should translate into dummy variables
data2$region1<-data2$REGION
data2$region1[data2$REGION!=1]<-0
data2$region2<-data2$REGION
data2$region2[data2$REGION!=2]<-0
data2$region2[data2$REGION==2]<-1
data2$region3<-data2$REGION
data2$region3[data2$REGION!=3]<-0
data2$region3[data2$REGION==3]<-1
data2$region4<-data2$REGION
data2$region4[data2$REGION!=4]<-0
data2$region4[data2$REGION==4]<-1
data2$region5<-data2$REGION
data2$region5[data2$REGION!=5]<-0
data2$region5[data2$REGION==5]<-1
data2$region6<-data2$REGION
data2$region6[data2$REGION!=6]<-0
data2$region6[data2$REGION==6]<-1
data2$region7<-data2$REGION
data2$region7[data2$REGION!=7]<-0
data2$region7[data2$REGION==7]<-1
data2$region8<-data2$REGION
data2$region8[data2$REGION!=8]<-0
data2$region8[data2$REGION==8]<-1
data2$region9<-data2$REGION
data2$region9[data2$REGION!=9]<-0
data2$region9[data2$REGION==9]<-1


row_reg <- cbind(sum((1-is.na(data2$REGION)),na.rm=TRUE),
                 mean(data2$REGION,na.rm=TRUE),
                 sd(data2$REGION,na.rm=TRUE),
                 min(data2$REGION,na.rm=TRUE),
                 max(data2$REGION,na.rm=TRUE))
row_reg <- round(row_reg,digits = 2)

byc_reg<-aggregate(data2$REGION, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_reg<-as.matrix(byc_reg[,2])
byc_reg<-round(byc_reg,digits = 2)

# City size
table(data2$XNORCSIZ) # 0  NOT ASSIGNED   1  CITY GT 250000   2  CITY,50-250000   3  SUBURB, LRG CITY
# 4  SUBURB, MED CITY    5  UNINC,LRG CITY    6  UNINC,MED CITY     7  CITY,10-49999
# 8  TOWN GT 2500       9  SMALLER AREAS     10  OPEN COUNTRY
### in opposite order from the paper

######
# changed the order below, to be consistent with the paper:

data2$XNORCSIZ[data2$XNORCSIZ==0] <- NA   
data2$XNORCSIZ_reversed <- 11-data2$XNORCSIZ
table(data2$XNORCSIZ_reversed)

row_size <- cbind(sum((1-is.na(data2$XNORCSIZ_reversed)),na.rm=TRUE),
                  mean(data2$XNORCSIZ_reversed,na.rm=TRUE),
                  sd(data2$XNORCSIZ_reversed,na.rm=TRUE),
                  min(data2$XNORCSIZ_reversed,na.rm=TRUE),
                  max(data2$XNORCSIZ_reversed,na.rm=TRUE))
row_size <- round(row_size,digits = 2)

byc_size<-aggregate(data2$XNORCSIZ_reversed, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_size<-as.matrix(byc_size[,2])
byc_size<-round(byc_size,digits = 2)


# occupation prestige  
# coding methods : OCC, 1972-1991; Prestige, 1972-1990
#                 OCC80, 1988-2010; Prestige, 1988-2010
#                 overlap: 1988,1989,1990
#===============================================================
table(data2$OCC)   # 0: 25684
table(data2$PRESTIGE)# 1970   

table(data2$OCC80)  # 0: 19986
table(data2$PRESTIGE80)# 1980

table(data2$PRESTIGE[data2$YEAR==1990])
table(data2$PRESTG80[data2$YEAR==1990])

# method1: 1988 as cutting point, 1972-1987 old code, 1988-2008 new code
data2$occ1<-data2$OCC
data2$occ1[data2$YEAR>=1988]<-data2$OCC80[data2$YEAR>=1988]
data2$occ1[data2$occ1!=0]<-1 
sd(data2$occ1)
summary(data2$occ1) # sd: 0.2084773    mean: 0.9545  close to the paper 0.22, 0.95 

data2$prest1<-data2$PRESTIGE
data2$prest1[data2$YEAR>=1988]<-data2$PRESTG80[data2$YEAR>=1988]
data2$prest1<-data2$prest1/10
data2$prest1[data2$prest1==0]<-NA
sd(data2$prest1,na=TRUE)
summary(data2$prest1, na=T)  # sd: 1.404535  mean: 4.242  close to the paper 1.37, 4.23

row_occ1 <- cbind(sum((1-is.na(data2$occ1)),na.rm=TRUE),
                  mean(data2$occ1,na.rm=TRUE),
                  sd(data2$occ1,na.rm=TRUE),
                  min(data2$occ1,na.rm=TRUE),
                  max(data2$occ1,na.rm=TRUE))
row_occ1 <- round(row_occ1,digits = 2)

row_prest1 <- cbind(sum((1-is.na(data2$prest1)),na.rm=TRUE),
                    mean(data2$prest1,na.rm=TRUE),
                    sd(data2$prest1,na.rm=TRUE),
                    min(data2$prest1,na.rm=TRUE),
                    max(data2$prest1,na.rm=TRUE))
row_prest1 <- round(row_prest1,digits = 2)


# assign respective cateory's midpoint to income
data2$income<-data2$INCOME72
summary(data2$income)
data2$income[data2$INCOME72>90|data2$INCOME>90|data2$INCOME77>90|data2$INCOME82>90|data2$INCOME86>90|data2$INCOME91>90|data2$INCOME98>90|data2$INCOME06>90]<-NA

data2$income[data2$INCOME72==1&data2$YEAR==1972]<-1000
data2$income[data2$INCOME72==2&data2$YEAR==1972]<-3000
data2$income[data2$INCOME72==3&data2$YEAR==1972]<-5000
data2$income[data2$INCOME72==4&data2$YEAR==1972]<-7000
data2$income[data2$INCOME72==5&data2$YEAR==1972]<-9000
data2$income[data2$INCOME72==6&data2$YEAR==1972]<-11250
data2$income[data2$INCOME72==7&data2$YEAR==1972]<-13750
data2$income[data2$INCOME72==8&data2$YEAR==1972]<-16250
data2$income[data2$INCOME72==9&data2$YEAR==1972]<-18750
data2$income[data2$INCOME72==10&data2$YEAR==1972]<-22500
data2$income[data2$INCOME72==11&data2$YEAR==1972]<-27500
data2$income[data2$INCOME72==12&data2$YEAR==1972]<-37500  
data2$income[data2$INCOME72==13&data2$YEAR==1972]<-NA    

data2$income[data2$INCOME==1&data2$YEAR<=1976&data2$YEAR>=1973]<-500
data2$income[data2$INCOME==2&data2$YEAR<=1976&data2$YEAR>=1973]<-2000
data2$income[data2$INCOME==3&data2$YEAR<=1976&data2$YEAR>=1973]<-3500
data2$income[data2$INCOME==4&data2$YEAR<=1976&data2$YEAR>=1973]<-4500
data2$income[data2$INCOME==5&data2$YEAR<=1976&data2$YEAR>=1973]<-5500
data2$income[data2$INCOME==6&data2$YEAR<=1976&data2$YEAR>=1973]<-6500
data2$income[data2$INCOME==7&data2$YEAR<=1976&data2$YEAR>=1973]<-7500
data2$income[data2$INCOME==8&data2$YEAR<=1976&data2$YEAR>=1973]<-9000
data2$income[data2$INCOME==9&data2$YEAR<=1976&data2$YEAR>=1973]<-12500
data2$income[data2$INCOME==10&data2$YEAR<=1976&data2$YEAR>=1973]<-17500
data2$income[data2$INCOME==11&data2$YEAR<=1976&data2$YEAR>=1973]<-22500
data2$income[data2$INCOME==12&data2$YEAR<=1976&data2$YEAR>=1973]<-31250 
data2$income[data2$INCOME==13&data2$YEAR<=1976&data2$YEAR>=1973]<-NA   

data2$income[data2$INCOME77==1&data2$YEAR<=1980&data2$YEAR>=1977]<-500
data2$income[data2$INCOME77==2&data2$YEAR<=1980&data2$YEAR>=1977]<-2000
data2$income[data2$INCOME77==3&data2$YEAR<=1980&data2$YEAR>=1977]<-3500
data2$income[data2$INCOME77==4&data2$YEAR<=1980&data2$YEAR>=1977]<-4500
data2$income[data2$INCOME77==5&data2$YEAR<=1980&data2$YEAR>=1977]<-5500
data2$income[data2$INCOME77==6&data2$YEAR<=1980&data2$YEAR>=1977]<-6500
data2$income[data2$INCOME77==7&data2$YEAR<=1980&data2$YEAR>=1977]<-7500
data2$income[data2$INCOME77==8&data2$YEAR<=1980&data2$YEAR>=1977]<-9000
data2$income[data2$INCOME77==9&data2$YEAR<=1980&data2$YEAR>=1977]<-11250
data2$income[data2$INCOME77==10&data2$YEAR<=1980&data2$YEAR>=1977]<-13750
data2$income[data2$INCOME77==11&data2$YEAR<=1980&data2$YEAR>=1977]<-16250
data2$income[data2$INCOME77==12&data2$YEAR<=1980&data2$YEAR>=1977]<-18750
data2$income[data2$INCOME77==13&data2$YEAR<=1980&data2$YEAR>=1977]<-21250
data2$income[data2$INCOME77==14&data2$YEAR<=1980&data2$YEAR>=1977]<-23750
data2$income[data2$INCOME77==15&data2$YEAR<=1980&data2$YEAR>=1977]<-37500
data2$income[data2$INCOME77==16&data2$YEAR<=1980&data2$YEAR>=1977]<-62500
data2$income[data2$INCOME77==17&data2$YEAR<=1980&data2$YEAR>=1977]<-NA

data2$income[data2$INCOME82==1&data2$YEAR<=1985&data2$YEAR>=1982]<-500
data2$income[data2$INCOME82==2&data2$YEAR<=1985&data2$YEAR>=1982]<-2000
data2$income[data2$INCOME82==3&data2$YEAR<=1985&data2$YEAR>=1982]<-3500
data2$income[data2$INCOME82==4&data2$YEAR<=1985&data2$YEAR>=1982]<-4500
data2$income[data2$INCOME82==5&data2$YEAR<=1985&data2$YEAR>=1982]<-5500
data2$income[data2$INCOME82==6&data2$YEAR<=1985&data2$YEAR>=1982]<-6500
data2$income[data2$INCOME82==7&data2$YEAR<=1985&data2$YEAR>=1982]<-7500
data2$income[data2$INCOME82==8&data2$YEAR<=1985&data2$YEAR>=1982]<-9000
data2$income[data2$INCOME82==9&data2$YEAR<=1985&data2$YEAR>=1982]<-11250
data2$income[data2$INCOME82==10&data2$YEAR<=1985&data2$YEAR>=1982]<-13750
data2$income[data2$INCOME82==11&data2$YEAR<=1985&data2$YEAR>=1982]<-16250
data2$income[data2$INCOME82==12&data2$YEAR<=1985&data2$YEAR>=1982]<-18750
data2$income[data2$INCOME82==13&data2$YEAR<=1985&data2$YEAR>=1982]<-21250
data2$income[data2$INCOME82==14&data2$YEAR<=1985&data2$YEAR>=1982]<-23750
data2$income[data2$INCOME82==15&data2$YEAR<=1985&data2$YEAR>=1982]<-30000
data2$income[data2$INCOME82==16&data2$YEAR<=1985&data2$YEAR>=1982]<-42500
data2$income[data2$INCOME82==17&data2$YEAR<=1985&data2$YEAR>=1982]<-62500
data2$income[data2$INCOME82==18&data2$YEAR<=1985&data2$YEAR>=1982]<-NA

data2$income[data2$INCOME86==1&data2$YEAR<=1990&data2$YEAR>=1986]<-500
data2$income[data2$INCOME86==2&data2$YEAR<=1990&data2$YEAR>=1986]<-2000
data2$income[data2$INCOME86==3&data2$YEAR<=1990&data2$YEAR>=1986]<-3500
data2$income[data2$INCOME86==4&data2$YEAR<=1990&data2$YEAR>=1986]<-4500
data2$income[data2$INCOME86==5&data2$YEAR<=1990&data2$YEAR>=1986]<-5500
data2$income[data2$INCOME86==6&data2$YEAR<=1990&data2$YEAR>=1986]<-6500
data2$income[data2$INCOME86==7&data2$YEAR<=1990&data2$YEAR>=1986]<-7500
data2$income[data2$INCOME86==8&data2$YEAR<=1990&data2$YEAR>=1986]<-9000
data2$income[data2$INCOME86==9&data2$YEAR<=1990&data2$YEAR>=1986]<-11250
data2$income[data2$INCOME86==10&data2$YEAR<=1990&data2$YEAR>=1986]<-13750
data2$income[data2$INCOME86==11&data2$YEAR<=1990&data2$YEAR>=1986]<-16250
data2$income[data2$INCOME86==12&data2$YEAR<=1990&data2$YEAR>=1986]<-18750
data2$income[data2$INCOME86==13&data2$YEAR<=1990&data2$YEAR>=1986]<-21250
data2$income[data2$INCOME86==14&data2$YEAR<=1990&data2$YEAR>=1986]<-23750
data2$income[data2$INCOME86==15&data2$YEAR<=1990&data2$YEAR>=1986]<-27500
data2$income[data2$INCOME86==16&data2$YEAR<=1990&data2$YEAR>=1986]<-32500
data2$income[data2$INCOME86==17&data2$YEAR<=1990&data2$YEAR>=1986]<-37500
data2$income[data2$INCOME86==18&data2$YEAR<=1990&data2$YEAR>=1986]<-45000
data2$income[data2$INCOME86==19&data2$YEAR<=1990&data2$YEAR>=1986]<-55000
data2$income[data2$INCOME86==20&data2$YEAR<=1990&data2$YEAR>=1986]<-75000
data2$income[data2$INCOME86==21&data2$YEAR<=1990&data2$YEAR>=1986]<- NA

data2$income[data2$INCOME91==1&data2$YEAR<=1997&data2$YEAR>=1991]<-500
data2$income[data2$INCOME91==2&data2$YEAR<=1997&data2$YEAR>=1991]<-2000
data2$income[data2$INCOME91==3&data2$YEAR<=1997&data2$YEAR>=1991]<-3500
data2$income[data2$INCOME91==4&data2$YEAR<=1997&data2$YEAR>=1991]<-4500
data2$income[data2$INCOME91==5&data2$YEAR<=1997&data2$YEAR>=1991]<-5500
data2$income[data2$INCOME91==6&data2$YEAR<=1997&data2$YEAR>=1991]<-6500
data2$income[data2$INCOME91==7&data2$YEAR<=1997&data2$YEAR>=1991]<-7500
data2$income[data2$INCOME91==8&data2$YEAR<=1997&data2$YEAR>=1991]<-9000
data2$income[data2$INCOME91==9&data2$YEAR<=1997&data2$YEAR>=1991]<-11250
data2$income[data2$INCOME91==10&data2$YEAR<=1997&data2$YEAR>=1991]<-13750
data2$income[data2$INCOME91==11&data2$YEAR<=1997&data2$YEAR>=1991]<-16250
data2$income[data2$INCOME91==12&data2$YEAR<=1997&data2$YEAR>=1991]<-18750
data2$income[data2$INCOME91==13&data2$YEAR<=1997&data2$YEAR>=1991]<-21250
data2$income[data2$INCOME91==14&data2$YEAR<=1997&data2$YEAR>=1991]<-23750
data2$income[data2$INCOME91==15&data2$YEAR<=1997&data2$YEAR>=1991]<-27500
data2$income[data2$INCOME91==16&data2$YEAR<=1997&data2$YEAR>=1991]<-32500
data2$income[data2$INCOME91==17&data2$YEAR<=1997&data2$YEAR>=1991]<-37500
data2$income[data2$INCOME91==18&data2$YEAR<=1997&data2$YEAR>=1991]<-45000
data2$income[data2$INCOME91==19&data2$YEAR<=1997&data2$YEAR>=1991]<-55000
data2$income[data2$INCOME91==20&data2$YEAR<=1997&data2$YEAR>=1991]<-67500
data2$income[data2$INCOME91==21&data2$YEAR<=1997&data2$YEAR>=1991]<-93750
data2$income[data2$INCOME91==22&data2$YEAR<=1997&data2$YEAR>=1991]<-NA

data2$income[data2$INCOME98==1&data2$YEAR<=2005&data2$YEAR>=1998]<-500
data2$income[data2$INCOME98==2&data2$YEAR<=2005&data2$YEAR>=1998]<-2000
data2$income[data2$INCOME98==3&data2$YEAR<=2005&data2$YEAR>=1998]<-3500
data2$income[data2$INCOME98==4&data2$YEAR<=2005&data2$YEAR>=1998]<-4500
data2$income[data2$INCOME98==5&data2$YEAR<=2005&data2$YEAR>=1998]<-5500
data2$income[data2$INCOME98==6&data2$YEAR<=2005&data2$YEAR>=1998]<-6500
data2$income[data2$INCOME98==7&data2$YEAR<=2005&data2$YEAR>=1998]<-7500
data2$income[data2$INCOME98==8&data2$YEAR<=2005&data2$YEAR>=1998]<-9000
data2$income[data2$INCOME98==9&data2$YEAR<=2005&data2$YEAR>=1998]<-11250
data2$income[data2$INCOME98==10&data2$YEAR<=2005&data2$YEAR>=1998]<-13750
data2$income[data2$INCOME98==11&data2$YEAR<=2005&data2$YEAR>=1998]<-16250
data2$income[data2$INCOME98==12&data2$YEAR<=2005&data2$YEAR>=1998]<-18750
data2$income[data2$INCOME98==13&data2$YEAR<=2005&data2$YEAR>=1998]<-21250
data2$income[data2$INCOME98==14&data2$YEAR<=2005&data2$YEAR>=1998]<-23750
data2$income[data2$INCOME98==15&data2$YEAR<=2005&data2$YEAR>=1998]<-27500
data2$income[data2$INCOME98==16&data2$YEAR<=2005&data2$YEAR>=1998]<-32500
data2$income[data2$INCOME98==17&data2$YEAR<=2005&data2$YEAR>=1998]<-37500
data2$income[data2$INCOME98==18&data2$YEAR<=2005&data2$YEAR>=1998]<-45000
data2$income[data2$INCOME98==19&data2$YEAR<=2005&data2$YEAR>=1998]<-55000
data2$income[data2$INCOME98==20&data2$YEAR<=2005&data2$YEAR>=1998]<-67500
data2$income[data2$INCOME98==21&data2$YEAR<=2005&data2$YEAR>=1998]<-82500
data2$income[data2$INCOME98==22&data2$YEAR<=2005&data2$YEAR>=1998]<-100000
data2$income[data2$INCOME98==23&data2$YEAR<=2005&data2$YEAR>=1998]<-137500
data2$income[data2$INCOME98==24&data2$YEAR<=2005&data2$YEAR>=1998]<-NA

data2$income[data2$INCOME06==1&data2$YEAR>=2006]<-500
data2$income[data2$INCOME06==2&data2$YEAR>=2006]<-2000
data2$income[data2$INCOME06==3&data2$YEAR>=2006]<-3500
data2$income[data2$INCOME06==4&data2$YEAR>=2006]<-4500
data2$income[data2$INCOME06==5&data2$YEAR>=2006]<-5500
data2$income[data2$INCOME06==6&data2$YEAR>=2006]<-6500
data2$income[data2$INCOME06==7&data2$YEAR>=2006]<-7500
data2$income[data2$INCOME06==8&data2$YEAR>=2006]<-9000
data2$income[data2$INCOME06==9&data2$YEAR>=2006]<-11250
data2$income[data2$INCOME06==10&data2$YEAR>=2006]<-13750
data2$income[data2$INCOME06==11&data2$YEAR>=2006]<-16250
data2$income[data2$INCOME06==12&data2$YEAR>=2006]<-18750
data2$income[data2$INCOME06==13&data2$YEAR>=2006]<-21250
data2$income[data2$INCOME06==14&data2$YEAR>=2006]<-23750
data2$income[data2$INCOME06==15&data2$YEAR>=2006]<-27500
data2$income[data2$INCOME06==16&data2$YEAR>=2006]<-32500
data2$income[data2$INCOME06==17&data2$YEAR>=2006]<-37500
data2$income[data2$INCOME06==18&data2$YEAR>=2006]<-45000
data2$income[data2$INCOME06==19&data2$YEAR>=2006]<-55000
data2$income[data2$INCOME06==20&data2$YEAR>=2006]<-67500
data2$income[data2$INCOME06==21&data2$YEAR>=2006]<-82500
data2$income[data2$INCOME06==22&data2$YEAR>=2006]<-100000
data2$income[data2$INCOME06==23&data2$YEAR>=2006]<-120000
data2$income[data2$INCOME06==24&data2$YEAR>=2006]<-140000
data2$income[data2$INCOME06==25&data2$YEAR>=2006]<-187500
data2$income[data2$INCOME06==26&data2$YEAR>=2006]<-NA

summary(data2$income)
sd(data2$income,na.rm=T)

# adjust for inflation, 2000 <- 1, "THE PAPER DOES NOT CLARIFY WHICH YEAR IS THE BASE YEAR"
# check: the CPI downloaded from BLS is different from Hout's paper.
table(data2$YEAR)
data2$income.adj<-data2$income
data2$income.adj[data2$YEAR==1972]<-data2$income[data2$YEAR==1972]/0.225  # inflation 1971
data2$income.adj[data2$YEAR==1973]<-data2$income[data2$YEAR==1973]/0.232
data2$income.adj[data2$YEAR==1974]<-data2$income[data2$YEAR==1974]/0.247
data2$income.adj[data2$YEAR==1975]<-data2$income[data2$YEAR==1975]/0.271
data2$income.adj[data2$YEAR==1976]<-data2$income[data2$YEAR==1976]/0.293
data2$income.adj[data2$YEAR==1977]<-data2$income[data2$YEAR==1977]/0.310
data2$income.adj[data2$YEAR==1978]<-data2$income[data2$YEAR==1978]/0.330
data2$income.adj[data2$YEAR==1980]<-data2$income[data2$YEAR==1980]/0.386
data2$income.adj[data2$YEAR==1982]<-data2$income[data2$YEAR==1982]/0.470
data2$income.adj[data2$YEAR==1983]<-data2$income[data2$YEAR==1983]/0.498
data2$income.adj[data2$YEAR==1984]<-data2$income[data2$YEAR==1984]/0.520
data2$income.adj[data2$YEAR==1985]<-data2$income[data2$YEAR==1985]/0.541
data2$income.adj[data2$YEAR==1986]<-data2$income[data2$YEAR==1986]/0.560
data2$income.adj[data2$YEAR==1987]<-data2$income[data2$YEAR==1987]/0.570
data2$income.adj[data2$YEAR==1988]<-data2$income[data2$YEAR==1988]/0.589
data2$income.adj[data2$YEAR==1989]<-data2$income[data2$YEAR==1989]/0.611
data2$income.adj[data2$YEAR==1990]<-data2$income[data2$YEAR==1990]/0.637
data2$income.adj[data2$YEAR==1991]<-data2$income[data2$YEAR==1991]/0.669
data2$income.adj[data2$YEAR==1993]<-data2$income[data2$YEAR==1993]/0.710
data2$income.adj[data2$YEAR==1994]<-data2$income[data2$YEAR==1994]/0.728
data2$income.adj[data2$YEAR==1996]<-data2$income[data2$YEAR==1996]/0.761
data2$income.adj[data2$YEAR==1998]<-data2$income[data2$YEAR==1998]/0.798
data2$income.adj[data2$YEAR==2000]<-data2$income[data2$YEAR==2000]/0.826
data2$income.adj[data2$YEAR==2002]<-data2$income[data2$YEAR==2002]/0.878
data2$income.adj[data2$YEAR==2004]<-data2$income[data2$YEAR==2004]/0.912
data2$income.adj[data2$YEAR==2006]<-data2$income[data2$YEAR==2006]/0.968
data2$income.adj[data2$YEAR==2008]<-data2$income[data2$YEAR==2008]/1.028


data2$income.adj<-data2$income.adj/1000  # as in the paper, unit: 1,000

summary(data2$income.adj)
sd(data2$income.adj,na.rm=TRUE)

row_income <- cbind(sum((1-is.na(data2$income.adj)),na.rm=TRUE),
                    mean(data2$income.adj,na.rm=TRUE),
                    sd(data2$income.adj,na.rm=TRUE),
                    min(data2$income.adj,na.rm=TRUE),
                    max(data2$income.adj,na.rm=TRUE))
row_income <- round(row_income,digits = 2)

byc_income<-aggregate(data2$income.adj, by=list(Cohort_group=data2$cohort.group), mean,na.rm=TRUE)
byc_income<-as.matrix(byc_income[,2])
byc_income<-round(byc_income,digits = 2)


# grand-mean centered
# data2$relative.support.dem<-data2$relative.support-mean(data2$relative.support) # DV shoud not be centered
# data2$AgeGroup.c<-scale(data2$AgeGroup, scale=F)
# data2$YEAR.c<-scale(data2$YEAR,scale=F)
data2$male.c<-scale(data2$male,scale=F)
data2$race.black.c<-scale(data2$race.black,scale=F)
data2$race.other.c<-scale(data2$race.other,scale=F)
data2$cohort.c<-scale(data2$cohort,scale=F)
data2$cohort_sqr.c<-scale(data2$cohort_sqr, scale=F) # I suspect this is the authors' way, which is wrong.
data2$cohort_sqr.c2<-data2$cohort.c^2   # should center x first, then generate x^2 
cor(data2$cohort_sqr.c, data2$cohort.c)
cor(data2$cohort_sqr.c2, data2$cohort.c)

data2$education.c<-scale(data2$education,scale=F)
data2$educ.center.c<-scale(data2$educ.center,scale=F)
data2$married.c<-scale(data2$MARITAL,scale=F)
data2$child.c<-scale(data2$CHILDS,scale=F)
data2$size.c<-scale(data2$XNORCSIZ_reverse,scale=F)
data2$empl.c<-scale(data2$WRKSTAT,scale=F)
data2$prest1.c<-scale(data2$prest1,scale=F)
data2$occ1.c<-scale(data2$occ1,scale=F)
data2$income.adj.c<-scale(data2$income.adj,scale=F)
data2$region1.c<-scale(data2$region1,scale=F)
data2$region2.c<-scale(data2$region2,scale=F)
data2$region3.c<-scale(data2$region3,scale=F)
data2$region4.c<-scale(data2$region4,scale=F)
data2$region5.c<-scale(data2$region5,scale=F)
data2$region6.c<-scale(data2$region6,scale=F)
data2$region7.c<-scale(data2$region7,scale=F)
data2$region8.c<-scale(data2$region8,scale=F)
data2$region9.c<-scale(data2$region9,scale=F)

##############  Tables
# table 1: descriptive

table1<-matrix(c(row_relativesupport,row_absolutesupport,row_allspending,row_cohort,row_age,row_year,row_male,row_raceb,row_raceo,row_edu,row_married,row_child,row_reg,row_size,row_empl,row_prest1,row_occ1,row_income),ncol=5,nrow=18,byrow=TRUE)
dimnames(table1)<-list(c("Relative support","Absolute support", "All spending", "Cohort","Age","Year","Gender male","Race black","Race other","Education years/10","Married","No. of Children","Region","City size","Employed/in school","Occupation prestige/10","Reports occupation","Family income"),c("N","Mean","SD","Min","Max"))

library(stargazer)
stargazer(table1, title="Variable Descriptive Statistics - All Cohorts",
          align=FALSE, digits=2, no.space=TRUE)

table2<-matrix(c(byc_relativesupport,byc_absolutesupport,byc_allspending,byc_cohort,byc_age,byc_year,byc_male,byc_raceb,byc_raceo,byc_edu,byc_married,byc_child,byc_reg,byc_size,byc_empl,byc_prest1,byc_occ1,byc_income),ncol=4,nrow=18,byrow=TRUE)
dimnames(table2)<-list(c("Relative support","Absolute support", "All spending", "Cohort","Age","Year","Gender male","Race black","Race other","Education years/10","Married","No. of Children","Region","City size","Employed/in school","Occupation prestige/10","Reports occupation","Family income"),c("1900-1929","1930-49","1950-69","1970-88"))

library(stargazer)
stargazer(table2, title="Variable Descriptive Statistics - Means by Cohort",
          align=FALSE, digits=2, no.space=TRUE)

#Hierarchical Linear Modeling

### table 2
fm1 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c 
            + ( 1| AgeGroup) + (1|YEAR), data2, REML=FALSE)
summary(fm1)

fm2 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(fm2)

fm3 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + educ.center.c + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c 
            +( 1| AgeGroup) + (1|YEAR), data2, REML=FALSE)
summary(fm3)

fm4 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + educ.center.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(fm4)

#### table 3
fm5 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:education.c + cohort_sqr.c:education.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(fm5)

fm6 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + educ.center.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:educ.center.c + cohort_sqr.c:educ.center.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(fm6)

fm7 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:prest1.c + cohort_sqr.c:prest1.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(fm7)

fm8 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:income.adj.c + cohort_sqr.c:income.adj.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(fm8)

#library(foreign)
#write.dta(data2, "data2_STATA.dta")


library(stargazer)
stargazer(fm1,fm2,fm3,fm4, title="Regression Replication Results of Table 2 in the Paper",
          align=TRUE, no.space=TRUE)


###########################################################################################################################################


# 1. cohort_sqr.c2, grand-mean center: beta4<0, significant, different from the paper, yeah!
# findings: across all cohorts, SES*COHORT<0

# data2$cohort_sqr.c<-scale(data2$cohort_sqr, scale=F) # I suspect this is the authors' way, which is wrong.
# data2$cohort_sqr.c2<-data2$cohort.c^2   # should center x first, then generate x^2 

# cor(data2$cohort_sqr.c, data2$cohort.c) # 0.9661658
# cor(data2$cohort_sqr.c2, data2$cohort.c) # -0.2595254

avrg_relativesupport<-aggregate(data2$relative.support, by=list(data2$cohort), mean,na.rm=TRUE)
plot(avrg_relativesupport$Group.1,avrg_relativesupport$x)
boxcox(relative.support~ cohort.c, data = data2,
       lambda = seq(-2, 2, len = 20))  # yeah, quadratic relationship

library(lme4)
m1 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(m1) # consistent with paper

m2 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:education.c + cohort_sqr.c2:education.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(m2) # inconsistent
print(summary(m2), correlation=TRUE)

m3 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + educ.center.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:educ.center.c + cohort_sqr.c2:educ.center.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(m3) # inconsistent

m4 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + education.c
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:prest1.c + cohort_sqr.c2:prest1.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(m4) # inconsistent

m5 <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            + cohort.c:income.adj.c + cohort_sqr.c2:income.adj.c
            +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(m5) # inconsistent


## 4. ordered logit  # NOT SURE, "CHECK", SHOULD WE CONTROL FOR TOTAL.SPENDING?

z.ol.out1 <- zelig(as.factor(absolute.support)~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
                   + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c,
                   model="ologit", robust=TRUE, cluster="YEAR", data=data2)
summary(z.ol.out1) 
# the effect of race.black changes from negative to positive


z.ol.out2 <- zelig(as.factor(absolute.support)~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2+ education.c 
             + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
             + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c,
             model="ologit", robust=TRUE, cluster="YEAR", data=data2)
summary(z.ol.out2) 
# the effect of race.black again changes from negative to positive
# the coefficient of education increases by 20 times
# indicating that the effect of education on absolute spending is actually significantly higher than it was originally modelled


#x.ol.out2<- setx(z.ol.out2, fn=NULL)
#s.ol.out2<- sim(z.ol.out2, x = x.out)
#summary(s.ol.out2)
#plot(s.ol.out2)

z.ol.out3 <- zelig(as.factor(absolute.support)~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2+ educ.center.c 
                   + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c,
                   model="ologit", robust=TRUE, cluster="YEAR", data=data2)
summary(z.ol.out3) 
# the effect of race.black again changes from negative to positive
# the coefficient of cohort increases by about 10 times


z.ol.out4 <- zelig(as.factor(absolute.support)~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2+ educ.center.c 
                   + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
                   + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c,
                   model="ologit", robust=TRUE, cluster="YEAR", data=data2)
summary(z.ol.out4) 
# the effect of race.black again changes from negative to positive
# the coefficient of educ.center increases by 20 times; and cohort by 10 times


## 5. robust standard error

fm1.r <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c 
            + ( 1| AgeGroup) + (1|YEAR), data2, robust=TRUE, REML=FALSE)
summary(fm1.r)
# slight change of figures, all signs remain the same

fm2.r <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + education.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            +( 1| AgeGroup) + (1|YEAR), data2,  robust=TRUE, REML = FALSE)
summary(fm2.r)


fm3.r <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + educ.center.c + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c 
            +( 1| AgeGroup) + (1|YEAR), data2,  robust=TRUE, REML=FALSE)
summary(fm3.r)

fm4.r <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c + educ.center.c 
            + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c
            + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
            +( 1| AgeGroup) + (1|YEAR), data2,  robust=TRUE, REML = FALSE)
summary(fm4.r)

# conclusion: No serious misspecification was found


# 9. partisan stance, democratic environmentalism

View(data2$PARTYID)
table(data2$PARTYID)
# 0  STRONG DEMOCRAT
# 1  NOT STR DEMOCRAT
# 2  IND,NEAR DEM
# 3  INDEPENDENT
# 4  IND,NEAR REP
# 5  NOT STR REPUBLICAN
# 6  STRONG REPUBLICAN
# 7  OTHER PARTY
# 8  DK
# 9  NA
aggregate(data2$relative.support,by=list(party=data2$PARTYID), mean, na.rm=T)
### cannot find the specific time when the democratic party took on environmentalism

##################################################################################################################
# 10. World Value Survey


#####################################
## simulation of coefficients
## ICs, graphs

# y normal distributed
# link function: miu=Xb; variance=

# fundamental uncertainty: y normal sampling

# estimation uncertainty: beta optimization
# use loglikelihood of linear model to simulate first

z.out <- zelig(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + educ.center.c 
          + married.c + child.c + size.c + empl.c + prest1.c + income.adj.c 
          + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c
          + cohort.c:educ.center.c + cohort_sqr.c2:educ.center.c, model="normal", robust=TRUE, cluster="YEAR", data=data2)
summary(z.out) 
x.out<- setx(z.out, fn=NULL)
s.out<- sim(z.out, x = x.out) 
summary(s.out)
names(s.out)
s.out$result


### simulation of cohort data

# generate cohort values with interval .1   
set.seed(02138)

data2$COHORT.T<-data2$COHORT-1900

data2$cohort.sim2<-as.integer(rnorm(47259, mean(data2$COHORT.T), sd(data2$COHORT.T)))
mean(data2$cohort.sim2)
sd(data2$cohort.sim2)
hist(data2$cohort.sim2)
sd(data2$COHORT.T)

data2$cohort.sim2[data2$cohort.sim2<0] <- 0
data2$cohort.sim2[data2$cohort.sim2>82] <- 82
data2$cohort.sim<-data2$cohort.sim2/10

# new
data2$cohort.sim.c<-scale(data2$cohort.sim,scale=F)
data2$cohort_sqr.sim.c2<-data2$cohort.sim.c^2   # should center x first, then generate x^2 
cor(data2$cohort.sim.c,data2$cohort_sqr.sim.c2)

m2.sim <- lmer(relative.support ~ male.c + race.black.c + race.other.c + cohort.sim.c + cohort_sqr.sim.c2 + education.c 
           + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
           + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
           + cohort.sim.c:education.c + cohort_sqr.sim.c2:education.c
           +( 1| AgeGroup) + (1|YEAR), data2, REML = FALSE)
summary(m2.sim) 



## test coefficient of linear RE model

m6 <- glm(relative.support ~ male.c + race.black.c + race.other.c + cohort.c + cohort_sqr.c2 + educ.center.c 
           + married.c + child.c + size.c + empl.c + prest1.c + occ1.c + income.adj.c 
           + region1.c + region2.c + region3.c + region4.c + region5.c + region6.c + region7.c + region8.c + region9.c
           + cohort.c:educ.center.c + cohort_sqr.c2:educ.center.c, family=gaussian,cluster=data2$YEAR, data2)
summary(m6) 


hist(data2$relative.support)
plot(data2$relative.support)
table(data2$absolute.support)
table(data2$all.spending)
sum(data2$all.spending)

